In [1]:
import numpy as np
import sklearn
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
%matplotlib
def is_covariance(x):
    # checks if x is a valid covariance matrix (symmetric and positive definite)
    symmetric = np.all(x == x.T)
    pos_def = np.all(np.linalg.eigvals(x) > 0)
    return symmetric and pos_def
def plot_data(x, title, eigen=False):
    # plot 3d data
    # x must be of size (n,3)
    # if eigen=True, also plot the eigenvectors of cov(x), labelled with the corresponding eigenvalue
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(x[:, 0], x[:, 1], zs=x[:, 2], alpha=0.1)
    limit_max = np.max(x) + 1
    limit_min = np.min(x) - 1
    ax.set_xlabel("x")
    ax.set_ylabel("y")
    ax.set_zlabel("z")
    ax.set_xlim(limit_min, limit_max)
    ax.set_ylim(limit_min, limit_max)
    ax.set_zlim(limit_min, limit_max)
    ax.set_title(title)
    if eigen:
        n, features = x.shape
        mu = np.mean(x, axis=0)
        pca = PCA(n_components=features)
        pca.fit(x)
        eigenvectors = pca.components_         # each row is an eigenvector
        eigenvalues = pca.explained_variance_  # eigenvalues
        scale = np.abs(x).max()
        scaled_eigenvectors = eigenvectors * scale / 4
        ax.scatter([mu[0]], [mu[1]], zs=[mu[2]], color="green", s=150)
        endpoints = scaled_eigenvectors + mu
        for i in range(features):
            ax.plot([mu[0], endpoints[i, 0]], [mu[1], endpoints[i, 1]], [mu[2], endpoints[i, 2]],
                    color='red', alpha=0.8, lw=2)
            ax.text(endpoints[i, 0], endpoints[i, 1], endpoints[i, 2],
                    r'$\lambda_{%d}$=%0.2f' % (i, eigenvalues[i]), size=8, zorder=1)
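A minimal usage sketch of is_covariance (the matrices below are illustrative examples, not part of the dataset):
In [ ]:
# symmetric with positive eigenvalues -> valid covariance matrix
print(is_covariance(np.array([[2.0, 0.5],
                              [0.5, 1.0]])))  # expected: True
# not symmetric -> not a valid covariance matrix
print(is_covariance(np.array([[1.0, 0.3],
                              [0.0, 1.0]])))  # expected: False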
Generate data to simulate a dataset of samples $x$ in which all 3 features/columns can be measured. $x$ has size $n \times features$.
In [2]:
# mean and covariance of the multivariate normal distribution used to generate samples
mu = np.array([5, 0, -2])
σ = np.array([[ 9,  1, -1],
              [ 1,  3, -2],
              [-1, -2,  2]])
if not is_covariance(σ):
    print("Warning: σ is not a valid covariance matrix (not symmetric or not positive definite)")
n = 1000  # number of samples
x = np.random.multivariate_normal(mu, σ, size=(n,))
#plot generated data
plt.close("all")
plot_data(x,"data in original space",eigen=True)
plt.show()
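As a quick sanity check (an addition to the original flow), the sample mean and sample covariance of the generated data should be close to mu and σ:
In [ ]:
# compare empirical statistics of x against the generating parameters
print("sample mean:", x.mean(axis=0))       # should be close to mu
print("sample covariance:\n", np.cov(x.T))  # should be close to σ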
Transform the generated data $x$ to a new basis given by the eigenvectors of $cov(x)$. In this new basis, the eigenvectors coincide with the canonical axis vectors $(1,0,0)$, $(0,1,0)$ and $(0,0,1)$.
In [3]:
pca_exact = PCA(n_components=3) # since x has 3 features, this PCA model does not do compression
pca_exact.fit(x) # calculate eigen decomposition of cov(x)
x_transformed=pca_exact.transform(x) #encode x with the eigenvectors as basis
plot_data(x_transformed,"x in natural (eigenvectors) space",eigen=True)
#save the eigenvectors and eigenvalues
eigenvectors=pca_exact.components_
eigenvalues=pca_exact.explained_variance_
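To make explicit what transform does (a sketch of the equivalence, assuming the default whiten=False): the transformed coordinates are the centered data projected onto the eigenvectors, $x_{transformed}=(x-\bar{x})W^T$, where the rows of $W$ are pca_exact.components_.
In [ ]:
# manual change of basis: center the data, then project onto the eigenvectors
x_manual = (x - pca_exact.mean_) @ pca_exact.components_.T
print(np.allclose(x_manual, x_transformed))  # expected: True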
Generate another dataset $y$ with the same distribution as $x$ (this is a very strong assumption!)
In [4]:
y=np.random.multivariate_normal(mu,σ,size=(n,))
plot_data(y,"y in original space",eigen=True)
Let's simulate the fact that for $y$ we cannot measure all features. We create y_missing, which keeps only the first 2 features.
In [5]:
y_missing=y[:,0:2]
plt.figure()
plt.scatter(y_missing[:,0],y_missing[:,1])
plt.title("y_missing in original space (2d)")
Now, let's assume that we can recover the last feature of y_missing using information from the eigendecomposition of $cov(x)$. We perform PCA on $x$, but keep only the two most important eigenvectors (those with the largest eigenvalues). pca_reconstruction allows us to perform a forward transform from $R^3$ to $R^2$ (compression) and an inverse transform from $R^2$ to $R^3$ (reconstruction). This can also be interpreted as going from the canonical basis to the eigenbasis and vice versa.
In [9]:
pca_reconstruction = PCA(n_components=2)  # keep only the 2 eigenvectors with the largest eigenvalues
pca_reconstruction.fit(x)
print(pca_reconstruction.components_)  # compare with the eigenvectors found by pca_exact
print(eigenvectors)
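The inverse transform is the reverse projection (again assuming whiten=False): $\hat{x}=x_{2d}W+\bar{x}$, with $W$ the $2 \times 3$ matrix of kept eigenvectors. A small round-trip sketch on $x$ itself, not part of the original analysis:
In [ ]:
# round trip on x: R^3 -> R^2 (compression) -> R^3 (reconstruction)
x_compressed = pca_reconstruction.transform(x)
x_roundtrip = pca_reconstruction.inverse_transform(x_compressed)
# manual reconstruction with the same formula: x_2d W + mean
x_manual = x_compressed @ pca_reconstruction.components_ + pca_reconstruction.mean_
print(np.allclose(x_roundtrip, x_manual))  # expected: True
print(((x - x_roundtrip) ** 2).sum() / n)  # ≈ smallest eigenvalue (the discarded variance)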
We still can't reconstruct y from y_missing with pca_reconstruction because y_missing is encoded in the canonical basis; we first need to encode it in the eigenbasis. To do that, we augment y_missing with a new third feature whose value is the mean of that feature (a reasonable assumption), and then encode the result.
In [12]:
# impute the missing third feature with its mean
y3 = np.zeros((n, 1)) + mu[2]
y_augmented = np.hstack([y_missing, y3])
# encode the augmented data in the eigenbasis of cov(x)
y_eigen = pca_exact.transform(y_augmented)
# drop the coordinate along the eigenvector with the smallest eigenvalue
least_eigenvalue_index = np.argmin(eigenvalues)
y_eigen_2d = y_eigen[:, np.arange(3) != least_eigenvalue_index]
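Since explained_variance_ is sorted in decreasing order, the dropped coordinate is always the last one, so the selection above is equivalent to keeping the first two eigenbasis coordinates:
In [ ]:
print(least_eigenvalue_index)                   # expected: 2
print(np.allclose(y_eigen_2d, y_eigen[:, :2]))  # expected: True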
In [13]:
# decode from the 2d eigenbasis back to the original 3d space
y_reconstructed = pca_reconstruction.inverse_transform(y_eigen_2d)
plot_data(y_reconstructed, "y_reconstructed", eigen=True)
In [14]:
# mean squared reconstruction error per sample
mean_reconstruction_error = ((y_reconstructed - y) ** 2).sum() / n
print(mean_reconstruction_error)
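For context (an added baseline, not part of the original analysis), the error of the naive mean-imputed y_augmented, i.e. filling the missing feature with its mean and doing nothing else, can be compared against the PCA reconstruction:
In [ ]:
# baseline: keep the two observed features and impute the third with its mean
baseline_error = ((y_augmented - y) ** 2).sum() / n
print("mean imputation error:", baseline_error)
print("pca reconstruction error:", mean_reconstruction_error)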